/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: November 2024

Inputs: 	pdb_full_entities.dta
			pdb_full_structures.dta
			pdb_full_papers.dta
			pdb_analysis.dta
		
Outputs: 	Tables 1, 3
			Figure E3
			
Purpose: 	Tab / graph summary stats

*******************************************************************************/

* Merge together data (structure level)
	clear all
	use "${data_built}pdb_full_entities.dta", clear
	collapse (mean) before_pdb, by(structureId)
	merge 1:1 structureId using "${data_built}pdb_full_structures.dta"
	assert _merge == 3
	drop _merge
	merge m:1 pubmedId using "${data_built}pdb_full_papers.dta"
	assert _merge != 2
	drop _merge

* Generate structures per paper OR project
	gen structureCount = structuresInPaper
	replace structureCount = structuresInProject if structuresInPaper == .
	
* Generate a PubMed ID / project ID
	gen double pubmedProjectId = pubmedId
	replace pubmedProjectId = projectId if pubmedId == . 
	// note project ID is always greater than pubmed ID
	
* Generate a published dummy (has a pubmed ID)
	gen published = (pubmedId != .)
	
* Start by focusing on x-ray structures - this will be the full sample
	keep if experimentalTechnique == "X-RAY DIFFRACTION"
	gen fullSample = 1
	
* Merge in and mark the analysis sample
	merge m:1 structureId using "${data_built}pdb_analysis.dta"
	gen analysisSample = (_merge == 3)
	drop _merge	
	
* Mark non-SG and SG samples
	gen nonSGSample = (analysisSample == 1 & structuralGenomics == 0)
	gen SGSample = (analysisSample == 1 & structuralGenomics == 1)
	
* Divide some variables by 1000 for interpretation
	replace structureMolecularWt = structureMolecularWt / 1000
	replace residueCount = residueCount / 1000
	replace atomSiteCount = atomSiteCount / 1000

* Locals for variables
	local structure_stats refinementResolution rFree ramaRaw maturation 	///
		  priorityRace numStructureAuthors numPaperAuthors numEntities 		///
		  structureMolecularWt residueCount atomSiteCount before_pdb 		///
		  depositionYear
	
	local paper_stats structureCount published stk_cites_nslf3

/*------------------------------------------------------------------------------

	Table 1: Summary Statistics: Full Sample vs. Analysis Sample
	Table 3: Summary Statistics: Non-SG Sample vs. SG Sample

------------------------------------------------------------------------------*/	
	
* Loop over samples
foreach sample in full analysis nonSG SG {

	preserve
	
	* Keep if correct samples
	keep if `sample'Sample == 1
	
	* Initialize matrix
	matrix summary_stats_`sample' = J(18,6,.)
	local row = 1
	local col = 1

	*** Panel A: structure-level ***********************************************
	
	* Collapse to the structure level
	collapse `structure_stats' `paper_stats' pubmedProjectId, by(structureId)

		* Calculate summary stats
		foreach var in `structure_stats' {
			sum `var', detail
			matrix summary_stats_`sample'[`row', `col'] = r(mean)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = r(p50)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = r(sd)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = r(min)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = r(max)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = (_N - r(N)) / _N
			local col = 1
			local ++row
		}

		* Add sample size
		matrix summary_stats_`sample'[`row', `col'] = _N
		local ++row
	
	*** Panel B: paper / project-level *****************************************
		
	* Collapse to the paper / project level
	collapse (mean) `paper_stats', by(pubmedProjectId)
			
		* Calculate summary stats
		foreach var in `paper_stats' {
			sum `var', detail
			matrix summary_stats_`sample'[`row', `col'] = r(mean)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = r(p50)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = r(sd)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = r(min)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = r(max)
			local ++col
			matrix summary_stats_`sample'[`row', `col'] = (_N - r(N)) / _N
			local col = 1
			local ++row
		}

		* Add sample size
		matrix summary_stats_`sample'[`row', `col'] = _N
		local ++row
		
	restore
}

* Combine full and analysis into a single table, and save
matrix summary_stats = summary_stats_full, summary_stats_analysis
preserve
	clear
	svmat summary_stats
	export excel "${tables}/Tables.xlsx", sheet(table1) cell(D60) sheetmodify
restore		

* Combine non-SG and SG into a single table, and save
matrix summary_stats_sg = summary_stats_nonSG, summary_stats_SG
preserve
	clear
	svmat summary_stats_sg
	export excel "${tables}/Tables.xlsx", sheet(table3) cell(D60) sheetmodify
restore		

/*------------------------------------------------------------------------------

	Figure E3: Distributions of Key Outcome Variables

------------------------------------------------------------------------------*/

* Histograms of key variables in analysis samples
keep if analysisSample == 1

winsor2 refinementResolution, replace cuts (0 99.9) trim
twoway  hist refinementResolution, start(0) bin(25) fcolor(navy%30)			///			 
		lcolor(navy%30) frac 												///
		xtitle("Refinement resolution (lower is better)") ytitle("Density")	///
		title("Panel A: Quality (Resolution)") name(resolution, replace) 	///
		ylab(0(0.2)1)

winsor2 ramaRaw, replace cuts (0 99.9) trim
twoway  hist rFree, start(0) bin(25) fcolor(navy%30) lcolor(navy%30) frac 	///
		xtitle("R-free (lower is better)") ytitle("Density")				///
		title("Panel B: Quality (R-free)") name(rFree, replace) ylab(0(0.2)1)

winsor2 ramaRaw, replace cuts (0 99.9) trim
twoway  hist ramaRaw, start(0) bin(25) fcolor(navy%30) lcolor(navy%30) frac ///
		xtitle("Ramachandran outliers (lower is better)") ytitle("Density")	///
		title("Panel C: Quality (Ramachandran outliers)") 					///
		name(ramaRaw, replace) ylab(0(0.2)1)
	
winsor2 maturation, replace cuts (0 99.9) trim
twoway  hist maturation, start(0) bin(25) fcolor(navy%30) lcolor(navy%30)  	///
		frac xtitle("Yrs btw collection and deposit") ytitle("Density")		///
		title("Panel D: Maturation") name(maturation, replace) ylab(0(0.2)1)
		
twoway  hist priorityRace, discrete start(0) fcolor(navy%30) 				///
		lcolor(navy%30)xtitle("Priority race indicator") ytitle("Density")	///
		title("Panel E: Competition") name(competition, replace) 			///
		ylab(0(0.2)1) xlab(0(1)1)
		
winsor2 numStructureAuthors, replace cuts (0 99.9) trim
twoway  hist numStructureAuthors, start(0) bin(25) fcolor(navy%30) 			///
		lcolor(navy%30) frac 												///
		xtitle("Number of structure authors") ytitle("Density")				///
		title("Panel F: Investment (structure authors)") 					///
		name(structureAuthors, replace) ylab(0(0.2)1)
		
winsor2 numPaperAuthors, replace cuts (0 99.9) trim
twoway  hist numPaperAuthors, start(0) bin(25) fcolor(navy%30) 				///
		lcolor(navy%30) frac 												///
		xtitle("Number of paper authors") ytitle("Density")	ylab(0(0.2)1)	///
		title("Panel G: Investment (paper authors)") name(paperAuthors, replace) 
		
graph combine resolution rFree ramaRaw maturation competition 				///
	  structureAuthors paperAuthors

	  graph save "${figures}figureE3.gph", replace
graph export "${figures}figureE3.pdf", replace
